Below, we load the packages required for data retrieval, analysis, and visualization.
library(tidyverse)
library(magrittr)
library(DBI)
library(dbplyr)
library(RMariaDB)
library(data.table)
library(stopwords)
library(tidytext)
library(RColorBrewer)
library(DT)
library(wordcloud)
W. Edwards Deming said, “In God we trust, all others must bring data.” Below, we will use data to explore the question, “Which are the most valued data science skills?”
# Connect to the remote MariaDB database holding the scraped job postings.
# NOTE(review): the user is "root" and the password lives in a plaintext file
# alongside the script -- consider a least-privilege account and a keyring.
con <- DBI::dbConnect(
  RMariaDB::MariaDB(),
  dbname = "dat_sci_jobs",
  username = "root",
  # readLines() is less fragile than read.table() for a one-line secret
  # (read.table would split a password containing whitespace into columns);
  # trimws() guards against a trailing newline or stray spaces.
  password = trimws(readLines("sql_db.txt"))[1],
  host = "35.227.102.234")
tables <- dbListTables(con)
# Pull the full _Jobs table into memory and render it as an interactive table.
jobs_df <- dbReadTable(con, "_Jobs")
datatable(jobs_df, options = list(pageLength = 25))
# Release the connection as soon as the data is local.
dbDisconnect(con)
# Merge any job-feed CSVs that have not been processed on a previous run.
# completed_files.txt tracks filenames already merged so reruns are idempotent.
completed_files <- readLines("completed_files.txt")
files <- list.files(pattern = "_feeds_.*csv$")
url_base <- "https://raw.githubusercontent.com/geedoubledee/data607_project3/main/"
# Filter first, then read each new feed into a list and bind once at the end --
# growing a data frame with rbind() inside a loop is O(n^2) in copies.
new_files <- files[!(files %in% completed_files)]
feed_list <- vector("list", length(new_files))
for (i in seq_along(new_files)) {
  feed_list[[i]] <- read.csv(file = paste0(url_base, new_files[i]),
                             header = TRUE)
}
new_jobs_df <- do.call(rbind, feed_list)
if (is.null(new_jobs_df)) {
  # No new feeds this run: keep the empty 11-column shape the downstream
  # nrow() check expects.
  new_jobs_df <- as.data.frame(matrix(nrow = 0, ncol = 11))
}
completed_files <- union(completed_files, new_files)
writeLines(completed_files, "completed_files.txt")
# Drop exact duplicate rows that can appear across overlapping feed files.
new_jobs_df <- new_jobs_df[!duplicated(new_jobs_df), ]
# If any new postings arrived, normalize their schema to match _Jobs and
# merge them into jobs_df, de-duplicating on Job_url.
if (nrow(new_jobs_df) > 0){
# Drop the feed-specific columns we do not store.
new_jobs_df <- subset(new_jobs_df, select = -c(X, author, summary,
content, extracted_content_url, published, created_at))
cols <- c("Job_id", "Site_id", "Job_title", "Job_url")
colnames(new_jobs_df) <- cols
rownames(new_jobs_df) <- NULL
# Reorder columns to the _Jobs table layout (Site_id last).
new_jobs_df <- new_jobs_df[c("Job_id", "Job_title", "Job_url",
"Site_id")]
# Job_complete = 0 marks a posting whose description has not been scraped.
new_jobs_df %<>%
mutate(Job_complete = 0)
# rbindlist() returns a data.table; keep only the first row per Job_url.
jobs_df <- rbindlist(list(jobs_df, new_jobs_df))[!duplicated(Job_url)]
# NOTE(review): this renumbers Job_id for ALL rows, including previously
# completed ones whose "<Job_id>.txt" files were written under the old ids --
# confirm those files are re-keyed or this may orphan them.
jobs_df %<>%
mutate(Job_id = row_number())
}
# Scrape the description text for every posting not yet fetched
# (Job_complete == 0) and write it to "<Job_id>.txt".
# Job_complete codes: 0 = pending, 1 = fetched, -1 = fetch failed.
#
# XPath selector for each source site's description container, keyed by
# Site_id. Replaces the original if/else chain; the original also reused a
# stale `desc` from a previous iteration when no branch matched -- unknown
# sites are now marked failed instead.
# (The original's httr::user_agent("Glen Davis") call was a no-op: its return
# value was discarded, so no header was ever set. Removed.)
desc_xpaths <- c(
  "2594160" = "//div[contains(@id, 'job-description')]",             # ai-jobs.net
  "977141"  = "//div[contains(@class, 'job-description')]",          # python.org
  "2594162" = "//div[contains(@class, 'arDetailDescriptionRow')]",   # careercast it & eng
  "1378327" = "//div[contains(@class, 'section_content')]",          # jobs for r-users
  "2593879" = "//div[contains(@class, 'jobsearch-jobDescriptionText')]", # Indeed
  "2594166" = "//div[contains(@class, 'job-desc')]",                 # Open Data Science
  "2594174" = "//div[contains(@class, 'job_description')]"           # MLconf
)
for (i in seq_len(nrow(jobs_df))) {
  # Skip rows already fetched (1) or already marked failed (-1).
  # Column 5 is Job_complete; [[5]][i] extracts a plain scalar whether
  # jobs_df is a data.frame or a data.table.
  if (jobs_df[[5]][i] != 0) {
    next
  }
  dat <- try(xml2::read_html(jobs_df$Job_url[[i]]), silent = TRUE)
  if (inherits(dat, "try-error")) {
    jobs_df[i, 5] <- -1
    next
  }
  # Column 4 is Site_id; look up the site-specific description selector.
  xpath <- desc_xpaths[as.character(jobs_df[[4]][i])]
  if (is.na(xpath)) {
    # Unknown source site: nothing sensible to extract.
    jobs_df[i, 5] <- -1
    next
  }
  desc <- xml2::xml_text(xml2::xml_find_all(dat, xpath))
  # Column 1 is Job_id; one text file per posting.
  writeLines(desc, con = paste0(jobs_df[[1]][i], ".txt"))
  jobs_df[i, 5] <- 1
}
# Reconnect and persist the updated jobs_df (with refreshed Job_complete
# flags) back to the _Jobs table.
con <- DBI::dbConnect(
  RMariaDB::MariaDB(),
  dbname = "dat_sci_jobs",
  username = "root",
  # Same one-line-secret read as the first connection: readLines() + trimws()
  # is less fragile than as.character(read.table(...)).
  password = trimws(readLines("sql_db.txt"))[1],
  host = "35.227.102.234")
tables <- dbListTables(con)  # NOTE(review): unused here; kept for parity with the first connection block
# overwrite = TRUE replaces the table contents with the merged data.
dbWriteTable(con, "_Jobs", jobs_df, overwrite = TRUE)
dbDisconnect(con)
# Move newly written "<Job_id>.txt" description files into jobs-txt/, then
# append their contents to text_df (one row per line of description text)
# and persist the result to text_df.csv.
files <- list.files(pattern = "^[1-9]+.*txt$")
completed_txt_files <- readLines("completed_txt_files.txt")
text_df <- read.csv("text_df.csv")
if (length(files) > 0) {
  # file.path() builds paths portably instead of pasting "/" by hand.
  file.copy(from = file.path(getwd(), files),
            to = file.path(getwd(), "jobs-txt", files))
  # file.remove() takes paths via `...`; the original's `from =` named
  # argument was silently harmless but misleading, so it is dropped.
  file.remove(file.path(getwd(), files))
  url_base <- "https://raw.githubusercontent.com/geedoubledee/data607_project3/main/jobs-txt/"
  cols <- c("Text", "Job_id", "Line")
  # Collect one matrix per new file, then bind once -- the original grew
  # text_df with rbind() inside the loop.
  additions <- vector("list", length(files))
  for (i in seq_along(files)) {
    if (files[i] %in% completed_txt_files) {
      next
    }
    # Escape the dot and anchor at the end so the regex strips exactly the
    # ".txt" extension (the original's ".txt" matched any char + "txt").
    job_id <- str_replace(files[i], "\\.txt$", "")
    lines <- readLines(paste0(url_base, files[i]))
    # The original rebuilt these loop-invariant matrices length(lines) times
    # inside an inner for-loop; build the columns once per file instead.
    addition <- cbind(lines,
                      rep(job_id, length(lines)),
                      seq_along(lines))
    colnames(addition) <- cols
    additions[[i]] <- addition
    completed_txt_files <- append(completed_txt_files, files[i])
  }
  # do.call(rbind, ...) skips the NULL slots of already-completed files.
  # NOTE(review): this assumes text_df.csv's columns line up with `cols`
  # (plus Text_id) -- confirm against how the CSV was first initialized.
  text_df <- rbind(text_df, do.call(rbind, additions))
  writeLines(completed_txt_files, "completed_txt_files.txt")
  # Renumber Text_id over the combined data and fix the column order.
  text_df %<>%
    mutate(Text_id = row_number())
  rownames(text_df) <- NULL
  text_df <- text_df[c("Text_id", "Text", "Job_id", "Line")]
  # row.names = FALSE prevents a spurious "X" column appearing when this
  # file is re-read by read.csv() on the next run.
  write.csv(text_df, "text_df.csv", row.names = FALSE)
}
# Tokenize the description text into single words, remove stopwords, and
# rank the remaining words by frequency.
tidy_text_df_words <- unnest_tokens(text_df, word, Text)
tidy_text_words_analysis <- tidy_text_df_words %>%
  anti_join(get_stopwords()) %>%
  count(word, sort = TRUE)
## Joining with `by = join_by(word)`
# Interactive table of word counts.
datatable(tidy_text_words_analysis, options = list(pageLength = 25))
## Warning in instance$preRenderHook(instance): It seems your data is too big for
## client-side DataTables. You may consider server-side processing:
## https://rstudio.github.io/DT/server.html
# Word cloud of the 50 most frequent words.
with(tidy_text_words_analysis, wordcloud(word, n, max.words = 50))
# Repeat the analysis with two-word phrases (bigrams), which surface
# multi-word skills that single-word counts miss.
tidy_text_df_bigrams <- unnest_tokens(text_df, bigram, Text,
                                      token = "ngrams", n = 2)
tidy_text_bigrams_analysis <- count(tidy_text_df_bigrams, bigram, sort = TRUE)
# Interactive table of bigram counts.
datatable(tidy_text_bigrams_analysis, options = list(pageLength = 25))
## Warning in instance$preRenderHook(instance): It seems your data is too big for
## client-side DataTables. You may consider server-side processing:
## https://rstudio.github.io/DT/server.html
# Word cloud of the 50 most frequent bigrams.
with(tidy_text_bigrams_analysis, wordcloud(bigram, n, max.words = 50))